In [ ]:
%matplotlib nbagg
import matplotlib.pyplot as plt
import numpy as np


In [ ]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
import numpy as np
np.set_printoptions(suppress=True)

digits = load_digits()
X, y = digits.data, digits.target
X_train, X_test, y_train, y_test = train_test_split(X, y)

Removing the mean and scaling to unit variance


In [ ]:
from sklearn.preprocessing import StandardScaler

1) Instantiate the model


In [ ]:
scaler = StandardScaler()

2) Fit using only the data (no labels are needed).


In [ ]:
scaler.fit(X_train)

3) Transform the data (not predict).


In [ ]:
X_train_scaled = scaler.transform(X_train)

In [ ]:
X_train.shape

In [ ]:
X_train_scaled.shape

The transformed version of the data has the mean removed and (for non-constant features) unit variance:


In [ ]:
X_train_scaled.mean(axis=0)

In [ ]:
X_train_scaled.std(axis=0)
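
Some of the standard deviations above are 0 rather than 1. These are features that are constant in the training set (typically blank border pixels of the 8x8 images); StandardScaler keeps their scale at 1 so no division by zero occurs. A quick sanity check, as a minimal sketch that is not needed for the rest of the notebook:


In [ ]:
# indices of features that are constant in the training data
# (their standard deviation is exactly 0 before and after scaling)
np.where(X_train.std(axis=0) == 0)[0]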

In [ ]:
X_test_transformed = scaler.transform(X_test)
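
Note that the test set is transformed with the statistics learned on the training set, so its per-feature mean is only approximately zero. A small sketch to illustrate this, using the scaler's mean_ and scale_ attributes:


In [ ]:
# transform(X_test) uses the training mean and scale, not the test set's own statistics
print(np.allclose(X_test_transformed, (X_test - scaler.mean_) / scaler.scale_))
X_test_transformed.mean(axis=0)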

Principal Component Analysis

0) Import the model


In [ ]:
from sklearn.decomposition import PCA

1) Instantiate the model


In [ ]:
pca = PCA(n_components=2)

2) Fit to the data (here we fit on the full dataset, since we only use PCA for visualization)


In [ ]:
pca.fit(X)

3) Transform to lower-dimensional representation


In [ ]:
print(X.shape)
X_pca = pca.transform(X)
X_pca.shape
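
How much of the variance in the data do these two components capture? PCA exposes this via its explained_variance_ratio_ attribute; a quick check:


In [ ]:
# fraction of the total variance explained by each of the two components
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())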

Visualize


In [ ]:
plt.figure()
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y)

In [ ]:
pca.components_.shape

In [ ]:
plt.matshow(pca.components_[0].reshape(8, 8), cmap="gray")
plt.colorbar()
plt.matshow(pca.components_[1].reshape(8, 8), cmap="gray")
plt.colorbar()
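
The 2D representation can also be mapped back to the original 64-dimensional pixel space with inverse_transform, which gives a feel for how much information the two components preserve. A sketch reconstructing the first digit in the dataset:


In [ ]:
# reconstruct images from the 2-component representation and compare to the original
X_reconstructed = pca.inverse_transform(X_pca)
fig, axes = plt.subplots(1, 2)
axes[0].imshow(X[0].reshape(8, 8), cmap="gray", interpolation="nearest")
axes[0].set_title("original")
axes[1].imshow(X_reconstructed[0].reshape(8, 8), cmap="gray", interpolation="nearest")
axes[1].set_title("PCA(2) reconstruction")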

Manifold Learning


In [ ]:
from sklearn.manifold import Isomap
isomap = Isomap()

In [ ]:
X_isomap = isomap.fit_transform(X)

In [ ]:
plt.figure()
plt.scatter(X_isomap[:, 0], X_isomap[:, 1], c=y)
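
Isomap was run with its defaults above (a 5-nearest-neighbor graph and a 2-dimensional embedding). The n_neighbors parameter controls how the neighborhood graph is built; a sketch of how you could experiment with it:


In [ ]:
# rebuild the embedding with a larger neighborhood graph to see how the layout changes
isomap_wide = Isomap(n_neighbors=30, n_components=2)
X_isomap_wide = isomap_wide.fit_transform(X)
plt.figure()
plt.scatter(X_isomap_wide[:, 0], X_isomap_wide[:, 1], c=y)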

Exercises

  • Visualize the digits dataset using the TSNE algorithm from the sklearn.manifold module (it runs for a couple of seconds).
  • Extract non-negative components from the digits dataset using NMF. Visualize the resulting components. The interface of NMF is identical to the PCA one. What qualitative difference can you find compared to PCA?

In [ ]:
# %load solutions/digits_unsupervised.py
from sklearn.manifold import TSNE
from sklearn.decomposition import NMF

# Compute TSNE embedding
tsne = TSNE()
X_tsne = tsne.fit_transform(X)

# Visualize TSNE results
plt.title("All classes")
plt.figure()
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y)

# build an NMF factorization of the digits dataset
nmf = NMF(n_components=16).fit(X)

# visualize the components
fig, axes = plt.subplots(4, 4)
for ax, component in zip(axes.ravel(), nmf.components_):
    ax.imshow(component.reshape(8, 8), cmap="gray", interpolation="nearest")
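
One way to see the qualitative difference asked about in the exercise is to plot the same number of PCA components side by side: the NMF components are non-negative and tend to look like additive parts of digits, while PCA components mix positive and negative values. A sketch for the comparison (pca16 is just a throwaway name used here):


In [ ]:
# for comparison: the first 16 PCA components of the same data
pca16 = PCA(n_components=16).fit(X)
fig, axes = plt.subplots(4, 4)
for ax, component in zip(axes.ravel(), pca16.components_):
    ax.imshow(component.reshape(8, 8), cmap="gray", interpolation="nearest")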
